# Commands to start an interactive session on the JHPCE cluster
qrsh -l mem_free=20G,h_vmem=20G
module load conda_R
cd /fastscratch/myscratch/akuo/alsf-filbin
R
library(here)
dataset = "premrna" # options = "premrna_mrna", "mrna", or "premrna"

Create SummarizedExperiment object

First, we create a table with information about where each file (quantified counts from salmon) is. This will be used to create a SummarizedExperiment object.

# list tumor names
tumor_names <- list.files(here("sample_data"))[
                  !grepl("*.txt", list.files(here("sample_data")))]

unique_sf_paths <- NULL
for(tum in tumor_names){
  ids <- list.files(here("sample_data", tum))
  ids <- unique(stringr::str_sub(ids, end=-13))
  ids <- here("salmon_quants", paste0(ids, "_quant"), "quant.sf")
  unique_sf_paths <- c(unique_sf_paths, ids)
}

unique_sf_ids <- NULL
for(tum in tumor_names){
  ids <- list.files(here("sample_data", tum))
  ids <- unique(stringr::str_sub(ids, end=-13))
  unique_sf_ids <- c(unique_sf_ids, ids)
}

coldata <- data.frame(files = unique_sf_paths, names = unique_sf_ids)

We also need to use the linkedTxome object to use tximeta properly, i.e. rowRanges(se) won’t be NULL and tximeta will be able to match the transcripts to the genes for us. Note: still doesn’t work

suppressPackageStartupMessages({
  library(tximeta)
  library(BiocFileCache)
})

# check if linkedTxome is already in the cache
bfcloc <- getTximetaBFC()
bfc <- BiocFileCache(bfcloc)
bfcinfo(bfc)

# if not, load linkedTxome json file
json_file <- here("salmon_files", "gencode.v32_salmon-index-v1.0.0.json")
loadLinkedTxome(json_file)

The only way I’ve figured out how to make this work for now is with skipMeta = TRUE.

se_file_name = here("salmon_quants", paste0("se_", dataset, ".rds"))

# coldata = coldata[1:2, ] # for testing
if(!file.exists(se_file_name)){
  # Create SummarizedExperiment object
  se <- tximeta(coldata, skipMeta = TRUE) # Takes a few minutes, file size = 597 MB
  # se <- tximeta(coldata) # run this line if you used mRNA transcripts in your index only, it will automatically detect the right transcriptome
  # se <- tximeta(coldata, ignoreAfterBar = TRUE)
  saveRDS(se, se_file_name)
} else {
  se = readRDS(se_file_name) # Takes a couple of seconds
}
suppressPackageStartupMessages({
  library(SummarizedExperiment)
  library(DESeq2)
})

colData(se)
assayNames(se)
rowRanges(se)

dat = assay(se, "counts")

Create SingleCellExperiment object

A SingleCellExperiment class is derived from the SummarizedExperiment class. The most important change is the addition of a new slot called reducedDims. Read more here.

# BiocManager::install('SingleCellExperiment')
library(SingleCellExperiment)
sce = SingleCellExperiment(assays = list(counts = dat))

# you can access counts by assay(sce, "counts") or counts(sce)
# you can add a new entry to assays slot by assay(sce, "counts_new") = dat_new

Quality Control

There are a couple of QC metrics to identify low-quality cells:

  1. Using counts, i.e. cells with (a) a small library size (total sum of counts) low_lib_size or (b) few expressed endogeneous genes (nonzero counts for those genes) low_n_features
  2. Using “spike-in transcripts”, i.e. any enrichment of spike-in transcripts (higher proportion)
  3. Using the mitochondrial genome, i.e. any enrichment of reads in the mitochondrial genome is indicative of loss of cytoplasmic RNA

I will only do (1) for now.

library(scater)

# Compute quality control metrics:
# sum is the total count for each cell
# detected contains the number of detected genes (actually transcripts for our data)
df = perCellQCMetrics(sce)
df

# Find outliers with low library sizes (LibSize) and few detected features (n_features)
reasons = quickPerCellQC(df) # DataFrame of logical values
colSums(as.matrix(reasons))

# Discard outliers
filtered = sce[, !reasons$discard] 
dat_filtered = counts(filtered) # 226608 x 518 for mrna, 221988 x 546 for premrn
sce_filtered = SingleCellExperiment(assays = list(counts = dat_filtered))

filtered_file_name = here("salmon_quants", paste0("sce_filtered_", dataset, ".rds"))
saveRDS(sce_filtered, filtered_file_name)

Diagnostic plots: https://osca.bioconductor.org/quality-control.html#quality-control-plots

Transform read counts

library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
dataset = "mrna"

Convert read counts to pseudo-UMIs

Run compute-mle.sh to get mle sig (“shape”) parameter first. There will be a parameter for every cell. It will take about 30 minutes to an hour.

# Load filtered counts
filtered_file_name = here("salmon_quants", paste0("sce_filtered_", dataset, ".rds"))
if(file.exists(filtered_file_name)){
  sce_filtered = readRDS(filtered_file_name)
  dat_filtered = counts(sce_filtered)
}

# Read in mle parameters
mle_file_names = list.files(here("mle_results"), full.names = TRUE)
mle_results = lapply(mle_file_names, readRDS)
sig_vec = sapply(mle_results, function(r) r["sig"])
mu_vec = sapply(mle_results, function(r) r["mu"])

# Plot sig ("shape") distribution
ggplot(tibble(sig = sig_vec), aes(x = sig)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of sig parameter",
       x = "sig",
       y = "Number of cells")
getMode <- function(x){
  keys <- unique(x)
  keys[which.max(tabulate(match(x, keys)))]
}

source(here("scripts", "quminorm.R"))

# Convert to pseudo-UMIs
umi_file_name = here("salmon_quants", paste0("sce_umi_", dataset, ".rds"))
if(file.exists(umi_file_name)){
  dat_umi = readRDS(umi_file_name)
} else {
  dat_umi = quminorm_matrix(dat_filtered, shape = getMode(sig_vec)) # Takes several minutes (20~30 min). In the paper, they use the mode.
  dat_umi = dat_umi[!(rowSums(dat_umi) == 0), ] # Remove transcripts with all zeros
  sce_umi = SingleCellExperiment(assays = list(counts = dat_umi))
  saveRDS(sce_umi, umi_file_name)
}

Aggregate counts at gene level

umi_file_name = here("salmon_quants", paste0("sce_umi_", dataset, ".rds"))
sce_umi = readRDS(umi_file_name)
dat_umi = counts(sce_umi)

# Counts by transcript
dim(dat_umi) #  176307 x 518 for mRNA, 173927 x 546 for premRNA

if(dataset == "mrna"){
  se_mrna = readRDS(here("salmon_quants", paste0("se_mrna.rds")))
  transcripts_mrna = rownames(assay(se_mrna, "counts"))
  row_ranges = rowRanges(se)
  genes = unlist(elementMetadata(row_ranges)$gene_id)
  mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes) %>%
    filter(transcript %in% rownames(dat_umi))
  dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes, reorder = FALSE)
  dim(dat_gene) # 48890 x 518 for mRNA
} else if(dataset == "premrna") {
  # Read in mrna data to look up genes
  se_mrna = readRDS(here("salmon_quants", paste0("se_mrna.rds")))
  transcripts_mrna = rownames(assay(se_mrna, "counts"))
  row_ranges = rowRanges(se_mrna)
  genes = unlist(elementMetadata(row_ranges)$gene_id)
  mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes)
  
  # Get genes for every premrna transcript
  # Note: 208 premRNA transcripts did not have corresponding mRNA transcripts
  transcripts_premrna = rownames(dat_umi)
  transcripts_premrna_converted = gsub(".{8}$", "" , transcripts_premrna)
  premrna_gene_matching_dt = left_join(tibble(transcript = transcripts_premrna_converted),
                                       mrna_gene_matching_dt, by = "transcript")
  
  dat_gene = rowsum(dat_umi, group = premrna_gene_matching_dt$genes, reorder = FALSE)
  # Remove counts for unmatched transcripts
  dat_gene = dat_gene[!is.na(rownames(dat_gene)), ]
  dim(dat_gene) # 50476 x 546
}

gene_file_name = here("salmon_quants", paste0("sce_gene_", dataset, ".rds"))
sce_gene = SingleCellExperiment(assays = list(counts = dat_gene))
saveRDS(sce_gene, gene_file_name)

Exploratory Plots

library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)

sce_prem = readRDS(here("salmon_quants", "sce_gene_premrna.rds"))
sce_mrna = readRDS(here("salmon_quants", "sce_gene_mrna.rds"))

dat_prem = counts(sce_prem)
dat_mrna = counts(sce_mrna)

Compare pre-mRNA vs mRNA counts

dat_prem_rowmeans = tibble(genes = rownames(dat_prem),
                           mean_prem = rowMeans(dat_prem))
dat_mrna_rowmeans = tibble(genes = rownames(dat_mrna),
                           mean_mrna = rowMeans(dat_mrna))
dat_rowmeans = full_join(dat_prem_rowmeans, dat_mrna_rowmeans, by = "genes")
dat_rowmeans %>%
  ggplot(aes(x = log(mean_mrna),
             y = log(mean_prem))) + 
  geom_point(alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(x = "Log(average gene expression with mRNA)",
       y = "Log(average gene expression with pre-mRNA)") +
  theme_bw()
## Warning: Removed 4924 rows containing missing values (geom_point).

Comparison to parametric distributions

pre-mRNA

First, I make all the column sums the same by scaling. The count in row \(i\) and column \(j\) is transformed as \(x_{ij} = x_{ij}*\sum_i x_{ij}/median_j(\sum_i x_{ij})\).

summary(colSums(dat_prem))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2101   18112   34582   41200   56534  309985
# Apply transformation
n = round(median(colSums(dat_prem)))
dat_prem = apply(dat_prem, MARGIN = 2, function(x) x*n/sum(x))

summary(colSums(dat_prem))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34582   34582   34582   34582   34582   34582

I plot the mean and variance for every row (transcript). Under a poisson distribution, they should be the same.

plot_meanvar = function(dat_sub){
  # estimate lambdas and variances for every transcript
  means = rowMeans(dat_sub)
  vars = apply(dat_sub, MARGIN = 1, var)
  
  # variance = mean under Poisson distribution
  tibble(means, vars) %>%
    ggplot(aes(x = log(means), y = log(vars))) +
    geom_point(alpha = 0.4) +
    geom_abline(intercept = 0, slope = 1)
}
plot_meanvar(dat_prem) +
  labs(title = "pre-mRNA")

I first compute the average expression for every row (x-axis) \(\hat{\mu}_i\) and the empirical \(P(X_i=0)\), which is the probability that for a given transcript \(i\), the count is 0.

I then compute what \(P(X_i=0)\) would be under the model assumptions of binomial or poisson, using parameters estimated from the data. In particular, for a \(Binom(n, p_i)\), \(n\) is the median number of total counts of cells and \(p_i\) is the mean proportion of counts that were in gene \(i\).

# Function to plot P(X_i = 0) against average expression level mu_i
plot_prob = function(dat_sub){
  n = round(median(colSums(dat_sub)))
  means = rowMeans(dat_sub)
  vars = apply(dat_sub, MARGIN = 1, var)
  
  # empirical P(X_i = 0)
  emp_probs_0 = apply(dat_sub, MARGIN = 1, function(r) sum(r==0)/ncol(dat_sub))
  plot_dt = tibble(means, emp_probs_0)
  
  emp_props = rowMeans(dat_sub/colSums(dat_sub))
  # Model P(X_i = 0) under Binomial
  binom_probs_0 = dbinom(x = 0, size = n, prob = emp_props)
  # Model P(X_i = 0) under Poisson
  poiss_probs_0 = dpois(x = 0, lambda = n*emp_props)
  # Model P(X_i = 0) under Negative Binomial
  # Estimate size/dispersion parameter
  model = lm(vars ~ 1*means + I(means^2) + 0, tibble(means, vars))
  phi = 1/coef(model)["I(means^2)"]
  nbinom_probs_0 = dnbinom(x = 0, size = phi, mu = n*emp_props) 
  
  # Tibble for plot
  plot_lines_dt = tibble(means = means,
                         binomial = binom_probs_0,
                         poisson = poiss_probs_0,
                         nbinomial = nbinom_probs_0) %>%
    pivot_longer(-means, names_to = "model", values_to = "probs_0")
  
  # Plot
  plt = plot_lines_dt %>%
    ggplot(aes(x = log(means), y = probs_0)) +
    geom_point(data = plot_dt, aes(x = log(means), y = emp_probs_0), alpha = 0.4) + # Add data points
    geom_line(aes(color = model),
              size = 1.5) + # Add lines for models
    labs(x = "Average expression level log(E(X_i))",
         y = "P(X_i = 0)") +
    theme(text = element_text(size = 15))
  
  # Return object
  out = list(plot = plt,
             lines_dt = plot_lines_dt)
  return(out)
}
# Plot P(X_i = 0) against average expression level
prob_out_prem = plot_prob(dat_prem)
prob_out_prem$plot +
  labs(title = "pre-mRNA")

mRNA

summary(colSums(dat_mrna))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1781   19750   42298   52722   71039  481647
# Apply transformation
n = round(median(colSums(dat_mrna)))
dat_mrna = apply(dat_mrna, MARGIN = 2, function(x) x*n/sum(x))

summary(colSums(dat_mrna))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   42298   42298   42298   42298   42298   42298
# Plot mean against variance
plot_meanvar(dat_mrna) +
  labs(title = "mRNA")

# Plot P(X_i = 0) against average expression level
prob_out_mrna = plot_prob(dat_mrna)
prob_out_mrna$plot +
  labs(title = "mRNA")

PCA

library(scran)
## Warning: package 'scran' was built under R version 3.6.2
library(BiocSingular)
## Warning: package 'BiocSingular' was built under R version 3.6.2
# library(factoextra)
library(tictoc)
library(glmpca)

pre-mRNA

sce = sce_prem
dat = dat_prem
# Cells in each cluster are normalized separately
clust = quickCluster(dat)
table(clust)
## clust
##   1   2   3   4 
## 178 140 109 119
deconv = computeSumFactors(dat, cluster = clust)
## Warning: use 'calculateSumFactors' for any 'x' that is not a
## SingleCellExperiment
X = dat %>%
  as.matrix() %>%
  sweep(., 2, deconv, "/") %>% # Divide by library size
  t()
X = log2(X + 1) # Apply log transformation

# Remove columns (transcripts) with zero variance
nonzero_var_cols = which(apply(X, 2, var) != 0)
X = X[, nonzero_var_cols]

# Run approximate PCA (8 seconds)
tic("approx PCA")
pc = runPCA(X, rank = 5, BSPARAM = IrlbaParam())
toc()
## approx PCA: 1.686 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(X, rank = 5, BSPARAM = RandomParam())
# toc()

# Run PCA with prcomp (~10 minutes)
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))

# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
  ggplot(aes(PC1, PC2, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC1, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC2, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
  ggplot(aes(PC4, PC5, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

mRNA

sce = sce_mrna
dat = dat_mrna
# Cells in each cluster are normalized separately
clust = quickCluster(dat)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
table(clust)
## clust
##   1   2   3 
## 176 169 173
deconv = computeSumFactors(dat, cluster = clust)
## Warning: use 'calculateSumFactors' for any 'x' that is not a
## SingleCellExperiment
X = dat %>%
  as.matrix() %>%
  sweep(., 2, deconv, "/") %>% # Divide by library size
  t()
X = log2(X + 1) # Apply log transformation

# Remove columns (transcripts) with zero variance
nonzero_var_cols = which(apply(X, 2, var) != 0)
X = X[, nonzero_var_cols]

# Run approximate PCA (8 seconds)
tic("approx PCA")
pc = runPCA(X, rank = 5, BSPARAM = IrlbaParam())
toc()
## approx PCA: 1.404 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(X, rank = 5, BSPARAM = RandomParam())
# toc()

# Run PCA with prcomp (~10 minutes)
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))

# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
  ggplot(aes(PC1, PC2, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC1, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
  ggplot(aes(PC2, PC3, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
  ggplot(aes(PC4, PC5, color = color)) +
  geom_point(alpha = 0.5) +
  theme_bw()

GLM-PCA

# Takes a long time, may want to try on cluster
tic("glm PCA")
glmpca(dat %>% as.matrix(), L = 5, fam = c("poi"))
toc()

Clustering